import pandas as pd
from matplotlib import pyplot as plt
import jieba
from wordcloud import WordCloud


def load_stopwords():
    """Load the stopword list from ``../data/stopword.txt``.

    Returns:
        list[str]: one stopword per line, with newline characters removed.
    """
    # `with` guarantees the file handle is closed (the original leaked it),
    # and a single comprehension replaces the two-pass read-then-strip loops.
    with open('../data/stopword.txt', encoding='utf8') as f:
        return [line.replace('\n', '') for line in f]


def read_text(filepath) -> str:
    """Return the full contents of a UTF-8 text file as one string.

    Args:
        filepath: path to the text file to read.

    Returns:
        str: the file's contents. (The original annotation said ``list``,
        but ``f.read()`` returns a single string.)
    """
    with open(filepath, encoding="utf8") as f:
        return f.read()


def remove_stopwords(text, extra_stopwords=None):
    """Tokenize *text* with jieba and drop stopwords.

    Args:
        text: raw text to segment.
        extra_stopwords: optional iterable of additional tokens to filter;
            defaults to the module-level ``additional_stopwords`` for
            backward compatibility with the original behavior.

    Returns:
        list[str]: tokens from ``jieba.lcut(text)`` not in the stopword set.
    """
    if extra_stopwords is None:
        # Fall back to the module-level global the original hard-coded.
        extra_stopwords = additional_stopwords
    stopwords = set(load_stopwords()).union(set(extra_stopwords))
    return [word for word in jieba.lcut(text) if word not in stopwords]


def term_freq(word_list, num=10):
    """Count token occurrences and keep only frequent terms.

    Args:
        word_list: iterable of tokens.
        num: exclusive lower bound — only terms occurring more than
            ``num`` times are kept.

    Returns:
        pd.Series: term -> count, ordered by descending frequency.
    """
    frequencies = pd.Series(word_list).value_counts()
    frequent_enough = frequencies > num
    return frequencies.loc[frequent_enough]


def main(corpus_fp=None, mask=None,
         font_path='C:/Windows/Fonts/simhei.ttf', output_fp="wc.png"):
    """Build a word-cloud image from the corpus and save it to disk.

    Args:
        corpus_fp: path to the corpus text file; defaults to the
            module-level ``corpus_filepath`` (backward compatible).
        mask: path to the mask image; defaults to the module-level
            ``mask_fp`` (backward compatible).
        font_path: TrueType font used to render glyphs (a CJK-capable
            font is required for Chinese text).
        output_fp: filename for the rendered word-cloud image.
    """
    corpus = read_text(corpus_fp if corpus_fp is not None else corpus_filepath)
    word_list = remove_stopwords(corpus)
    data = term_freq(word_list)

    wc = WordCloud(font_path=font_path,
                   width=1920,
                   height=1080,
                   mask=plt.imread(mask if mask is not None else mask_fp))
    # fit_words accepts the term->frequency mapping produced by term_freq.
    wc.fit_words(data)
    wc.to_file(output_fp)


if __name__ == "__main__":
    mask_fp = "../data/background.jpg"
    corpus_filepath = "../data/positive_samples.txt"
    additional_stopwords = ["label", "u3000", "\\", "\n", "!", "u30002"]
    main()
